# import libraries
import numpy as np
import pandas as pd
# import dataset
foodInspection = pd.read_csv (r'C:/Users/yang7/OneDrive/Desktop/python/Food_Inspection.csv')
# get the categories
foodInspection['Category'].unique()
After go through the names of each category, I think carry out, caterer, and restaurant are categories for restaurants.
# Create the dummy variable on restaurants
for i in range(13701):
if foodInspection.iat[i,-5]=='Carry Out' or foodInspection.iat[i,-5]== 'Caterer' or foodInspection.iat[i,-5]=='Restaurant':
foodInspection.iat[i, -5] = 1
else:
foodInspection.iat[i, -5] = 0
foodInspection.head()
import datetime as dt
# Create a new column for the year of the inspection.
# Create a new column for the month of the inspection.
# Create a column for the year and month.
foodInspection=foodInspection.assign(Year = ' ')
foodInspection=foodInspection.assign(Month = ' ')
foodInspection=foodInspection.assign(Month_Year= ' ')
for i in range(13701):
foodInspection.iat[i, -3] = foodInspection.iat[i,6][-4:]
foodInspection.iat[i, -2] = foodInspection.iat[i,6][0:2]
foodInspection.iat[i, -1] = foodInspection.iat[i,6][0:2]+'/'+foodInspection.iat[i,6][-4:]
foodInspection.head()
# Convert the Inspection_date column into a datetime column
for i in range(13701):
foodInspection.iat[i,6]=dt.datetime.strptime(foodInspection.iat[i,6], '%m/%d/%Y')
# For each column with the type of compliance, create a dummy variable that is 1 if the establishment is out of compliance and 0 otherwise.
for i in range(8,24):
for j in range(13701):
if foodInspection.iat[j,i] == 'Out of Compliance':
foodInspection.iat[j,i] = 1
elif foodInspection.iat[j,i] == 'In Compliance':
foodInspection.iat[j,i] = 0
else:
foodInspection.iat[j,i] = np.nan
foodInspection.head()
#Create a new column that contains the number of violations for that inspection (the number of categories where the establishment was not in compliance).
foodInspection=foodInspection.assign(Number_Of_Violations =0)
for j in range(13701):
count=0
for i in range(8,24):
if foodInspection.iat[j,i] == 1 or foodInspection.iat[j,i] == 0 :
count=count+foodInspection.iat[j,i]
foodInspection.iat[j, -1] = count
foodInspection.head()
# Create a dummy variable that is 1 if the establishment is out of compliance in any category.
foodInspection=foodInspection.assign(Out_Of_Compliance = 0)
for i in range(13701):
if foodInspection.iat[i, -2] != 0:
foodInspection.iat[i, -1] = 1
foodInspection.head()
# For establishments with multiple inspections, create a new DataFrame in wide format. Keep only the establishment ID, Category, Inspection_date, and number of violations.
NewFoodInspection= foodInspection[['Establishment ID', 'Category', 'Inspection Date','Number_Of_Violations']].copy()
NewFoodInspection=NewFoodInspection.sort_values('Inspection Date')
# Reshape from long to wide (pivot) such that each establishment is a row
NewFoodInspection['idx'] = NewFoodInspection.groupby('Establishment ID').cumcount()+1
NewFoodInspection = NewFoodInspection.pivot_table(index=['Establishment ID','Category'], columns='idx',
values=['Inspection Date', 'Number_Of_Violations'],aggfunc='first')
NewFoodInspection = NewFoodInspection.sort_index(axis=1, level=1)
NewFoodInspection.columns = [f'{x}/{y}' for x,y in NewFoodInspection.columns]
NewFoodInspection=NewFoodInspection.assign(Num_Inspection = 0)
for i in range(len(NewFoodInspection.index)) :
NewFoodInspection.iat[i,-1]=(34-NewFoodInspection.iloc[i].isnull().sum())/2
NewFoodInspection=NewFoodInspection[NewFoodInspection['Num_Inspection'] > 1]
NewFoodInspection
Make sure category is consistent within ID and resolve any discrepancies if necessary (i.e. each establishment has only one category).
# Create a table with the number of violations by violation type.
# Sort the table from the most common to least common violations.
restaurantsFood=foodInspection[foodInspection['Category'] == 1]
violations=pd.DataFrame(restaurantsFood.sum(axis = 0, skipna = True) )
violations=violations.iloc[6:22]
violations.rename(columns={0: 'Sum'}, inplace=True)
violations.sort_values('Sum', ascending=False)
Cold Holding Temperature is the most common violation.
# Prepare the dataset for the result table
comcount=[]
colnum=[]
for i in range(len(NewFoodInspection.index)):
n=33
while n >0:
if NewFoodInspection.iat[i,n]>0:
comcount.append('Never')
colnum.append(0)
break;
elif NewFoodInspection.iat[i,n]==0 and NewFoodInspection.iat[i,n-2]>0 :
comcount.append((n-1)/2)
colnum.append(((n-1)/2)+1)
break;
else:
n=n-2
NewFoodInspection['Times_to_complicant'] = comcount
NewFoodInspection['Colnum'] = colnum
NewFoodInspection.head()
# create a new dataframe with fixed size
compliant = pd.DataFrame(index=range(16),columns=range(17))
compliant.index = [f'{x+2} inspections' for x in compliant.index]
compliant.columns = [f'After {y} Reinspections' for y in compliant.columns]
compliant.insert(0, "Never compliant", np.nan, True)
# Create a table where each row is the number of inspections a restaurant has had and the columns are the number of reinspections until the establishment becomes compliant.
grouped = NewFoodInspection.groupby(by=['Num_Inspection','Colnum'])
for name, group in grouped:
compliant.iat[int(name[0]-2),int(name[1])]=int(len(group))
grouped2 = NewFoodInspection.groupby(by=['Num_Inspection'])
compliant=compliant.assign(Total = 0)
for name, group in grouped2:
compliant.iat[int(name-2),-1]=len(group)
grouped3 = NewFoodInspection.groupby(by=['Colnum'])
compliant.loc['Total']=np.nan
for name, group in grouped3:
compliant.iat[-1,int(name)]=int(len(group))
compliant = compliant.replace(np.nan, 0)
compliant.iat[-1,-1]=int(sum(compliant['Total']))
compliant
There are 3390 total restaurants haven had multiple inspections. There are 14 restaurants had over 10 inspections, and maximum is even 17. The restaurants that had more inspections tend to never compliant or be compliant after a long time. The inspection times has a positive relationship with the time that the restaurants become compliant.
import matplotlib.pyplot as plt
#limit the objects to restaurants
# Create a bar graph showing the results of 2.1
import seaborn as sns
plt.figure(figsize=(8,8))
sns.set(style="whitegrid")
sns.barplot(violations.index,'Sum', data=violations)
plt.ylabel('The Count of Violations')
plt.title('The Common Level of Violations')
plt.xticks(rotation='vertical')
# Create a line graph that shows the percent of restaurant inspections that have at least one violation by month and year.
grouped4 = restaurantsFood.groupby(by=['Month_Year'])
inspectionNumMonth=[]
for name, group in grouped4:
inspectionNumMonth.append([name,len(group)])
df = pd.DataFrame (inspectionNumMonth)
df.columns = ['Month_Year','Num']
restaurantsFoodViolations=restaurantsFood[restaurantsFood['Number_Of_Violations'] >= 1]
grouped5 = restaurantsFoodViolations.groupby(by=['Month_Year'])
ViolationsNumMonth=[]
for name, group in grouped5:
ViolationsNumMonth.append([name,len(group)])
df2 = pd.DataFrame (ViolationsNumMonth)
df2.columns = ['Month_Year','Num']
# join two dfs and generate a new percentage column.
joinData=pd.merge(df, df2, on='Month_Year')
joinData=joinData.assign(Percentage = np.nan)
for i in range(len(joinData.index)) :
joinData.iat[i,-1]=joinData.iat[i,2]/joinData.iat[i,1]
joinData.iat[i,0]=dt.datetime.strptime(joinData.iat[i,0], '%m/%Y')
joinData=joinData.sort_values('Month_Year')
joinData
plt.figure(figsize = (8,5))
sns.set(style="darkgrid")
line=sns.lineplot(x='Month_Year', y="Percentage",data=joinData)
for item in line.get_xticklabels():
item.set_rotation(60)
plt.xlabel("Time", fontsize = 15)
plt.ylabel('The Percentage of Restaurants that Have Violations')
plt.title('The Change of Strict Level on Inspections Over Time')
plt.show()
I do not think inspection is getting easier or harder over time, but there is certainly a pattern shown. The percentage of restaurants that have violations increases to a peak every summer. When temperture is high, the chance they get violations increases. In 2018-11 and 2019-04, most restaurants past the inspection.
import plotly.graph_objects as go
# Your mapbox token
mapbox_access_token = 'pk.eyJ1Ijoid2FueXVuLXlhbmciLCJhIjoiY2syb3E4cTU5MTZhbDNtbzNyejRxZDAzbSJ9.V9aZq1zuZ7bovxHrjfce6g'
# Limits the points in the united states
restaurantsFoodPass=restaurantsFood[restaurantsFood['Number_Of_Violations'] == 0]
restaurantsFoodPass=restaurantsFoodPass[restaurantsFoodPass['Longitude'] >= -125]
restaurantsFoodPass=restaurantsFoodPass[restaurantsFoodPass['Longitude'] <= -70]
restaurantsFoodViolations=restaurantsFood[restaurantsFood['Number_Of_Violations'] >= 1]
restaurantsFoodViolations=restaurantsFoodViolations[restaurantsFoodViolations['Longitude'] >= -125]
restaurantsFoodViolations=restaurantsFoodViolations[restaurantsFoodViolations['Longitude'] <= -70]
# Add a column saying 'No violations' to the restaurants that do not have violations:
restaurantsFoodPass['Tag']='No violations'
# Add a column with a list of violations to the restaurants that do have violations:
restaurantsFoodViolations=restaurantsFoodViolations.assign(Violations ='')
for j in range(3617):
violations=[]
for i in range(8,24):
if restaurantsFoodViolations.iat[j,i] == 1:
violations.append(restaurantsFoodViolations.columns[i])
restaurantsFoodViolations.iat[j, -1] = violations
restaurantsFoodViolations
restaurantsFoodPass['Longitude'].describe()
restaurantsFoodPass['Latitude'].describe()
restaurants_map_data1 = go.Scattermapbox(
lon = restaurantsFoodPass['Longitude'],
lat = restaurantsFoodPass['Latitude'],
text = restaurantsFoodPass['Tag'],
hoverinfo='text',
mode = 'markers',
marker = dict(
color = 'blue',
symbol = 'circle',
opacity = .5
),
name = "Restaurants without violations"
)
restaurants_map_data2 = go.Scattermapbox(
lon = restaurantsFoodViolations['Longitude'],
lat = restaurantsFoodViolations['Latitude'],
text = restaurantsFoodViolations['Violations'],
hoverinfo='text',
mode = 'markers',
marker = dict(
color = 'red',
symbol = 'circle',
opacity = .5
),
name = "Restaurants with violations"
)
restaurants_map_layout = go.Layout(
title = 'Restaurants Health Inspection',
mapbox=go.layout.Mapbox(
accesstoken=mapbox_access_token,
zoom=1
)
)
restaurants_map = go.Figure(data=[restaurants_map_data1, restaurants_map_data2], layout=restaurants_map_layout)
restaurants_map.update_layout(
hovermode='closest',
mapbox=go.layout.Mapbox(
accesstoken=mapbox_access_token,
bearing=0,
center=go.layout.mapbox.Center(
lat=39.074284,
lon= -77.118721
),
pitch=0,
zoom=9
)
)
restaurants_map.show()
The restaurants that have violations tend to be located along the traffic lines and in cities. Most restaurants with violations in Rockville and Wheaton have cold holding temperature issue. However, in silver springs, there are more restaurants that have rodent and insect problem.